Clue dataset

Individual indicators

load(paste0(IO$output_clue, "users.Rdata"), verbose = TRUE)
## Loading objects:
##   users
load(paste0(IO$output_clue, "cycles.Rdata"),verbose = TRUE)
## Loading objects:
##   cycles
load(paste0(IO$output_clue, "tracking.Rdata"), verbose = TRUE)
## Loading objects:
##   tracking
for(user_id in users$user_id[1:10]){
  t = tracking[tracking$user_id == user_id,]

  g = ggplot_user_tracking(t)
  print(g)
  g = ggplot_user_tracking(t, fertility_only = TRUE)
  print(g)
}
## Warning: Removed 280 rows containing missing values (geom_vline).
## Warning: Removed 174 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_vline).

## Warning: Removed 78 rows containing missing values (geom_vline).

## Warning: Removed 16 rows containing missing values (geom_vline).

## Warning: Removed 928 rows containing missing values (geom_vline).
## Warning: Removed 36 rows containing missing values (geom_point).

## Warning: Removed 78 rows containing missing values (geom_vline).

## Warning: Removed 344 rows containing missing values (geom_vline).
## Warning: Removed 21 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_vline).

## Warning: Removed 2144 rows containing missing values (geom_vline).
## Warning: Removed 241 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_vline).

## Warning: Removed 112 rows containing missing values (geom_vline).
## Warning: Removed 104 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_vline).

## Warning: Removed 12600 rows containing missing values (geom_vline).
## Warning: Removed 5 rows containing missing values (geom_point).

## Warning: Removed 669 rows containing missing values (geom_vline).

## Warning: Removed 36 rows containing missing values (geom_vline).

## Warning: Removed 10 rows containing missing values (geom_vline).

## Warning: Removed 42 rows containing missing values (geom_vline).
## Warning: Removed 193 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_vline).

## Warning: Removed 712 rows containing missing values (geom_vline).
## Warning: Removed 177 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_vline).

load(paste0(IO$tmp_clue,"users_aggregate_data_prep.Rdata"), verbose = TRUE)
## Loading objects:
##   users_aggregate
date_seq = seq(par$start_date, par$end_date, by = 1)

indiv_indicators = data.frame(user_id = rep(unique(tracking$user_id), each = length(date_seq)),
                              date = rep(date_seq, lu(tracking$user_id)), stringsAsFactors = FALSE)


m = match(indiv_indicators$user_id, users$user_id)
indiv_indicators$country = users$country[m]
indiv_indicators$age_cat = users$age_cat[m]
indiv_indicators$age = users$age[m]
indiv_indicators$bmi_cat = users$bmi_cat[m]
indiv_indicators$bmi = users$bmi[m]


indiv_indicators$active_use = FALSE

users_aggregate = users_aggregate[which(users_aggregate$user_id %in% unique(indiv_indicators$user_id)),]
for(user in unique(indiv_indicators$user_id)){
  j = which(users_aggregate$user_id == user)
  dates_active = seq(users_aggregate$first_obs_date[j], users_aggregate$last_obs_date[j],by =1)
  k = which((indiv_indicators$user_id == user) & (indiv_indicators$date %in% dates_active))
  indiv_indicators$active_use[k] = TRUE
}


indiv_indicators$day_id = paste0(indiv_indicators$user_id, "_", indiv_indicators$date)


# cycle_id
cycles_exp = as.data.frame(lapply(cycles, rep, cycles$cycle_length))
cycles_exp$cycleday = ave(rep(1,nrow(cycles_exp)), cycles_exp$cycle_id, FUN =cumsum)
cycles_exp$date = cycles_exp$cycle_start + (cycles_exp$cycleday - 1)
cycles_exp$day_id = paste0(cycles_exp$user_id, "_", cycles_exp$date)
  
# match indiv_indicators and cycles_sub_exp
m = match(indiv_indicators$day_id, cycles_exp$day_id)
indiv_indicators$cycle_nb = cycles_exp$cycle_nb[m]
indiv_indicators$cycle_id = cycles_exp$cycle_id[m]
indiv_indicators$cycle_length = cycles_exp$cycle_length[m]
indiv_indicators$cycleday = cycles_exp$cycleday[m]
  
indiv_indicators$cycleday_from_end = indiv_indicators$cycleday - indiv_indicators$cycle_length - 1


# time num 
indiv_indicators$time_num = par$time_num[match(indiv_indicators$date, par$date_seq)]


save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata"  ))

Sex

# unprotected sex

Fertility

# bleeding
tracking_bleeding = tracking[tracking$category == "period",]
indiv_indicators$bleeding = as.numeric(
  factor(tracking_bleeding$type[match(indiv_indicators$day_id, tracking_bleeding$day_id)],
                                   levels = c("spotting","light","medium","heavy")))

indiv_indicators$bleeding[is.na(indiv_indicators$bleeding)] = 0
rm(tracking_bleeding)

# mucus
mucus_types = c("creamy","sticky","egg_white")
tracking_mucus = tracking[tracking$type %in% mucus_types,]
indiv_indicators$mucus = tracking_mucus$type[match(indiv_indicators$day_id, tracking_mucus$day_id)]
indiv_indicators$mucus_num = as.numeric(factor(indiv_indicators$mucus, levels = mucus_types))
indiv_indicators$mucus_num[is.na(indiv_indicators$mucus_num)] = 0
rm(tracking_mucus)


# ovulation day
indiv_indicators$ovu_day = FALSE
indiv_indicators$ovu_day[indiv_indicators$cycleday_from_end == -13] = TRUE

# daily fertility
fertility_pattern = c(0.05,0.1,0.15,0.2,0.3,0.2,0.1)
indiv_indicators$daily_fertility = 0
k = which(indiv_indicators$ovu_day)
i = numeric(0)
for(j in k){
  i = c(i, (j-5):(j+1))
}
indiv_indicators$daily_fertility[i] = rep(fertility_pattern, length(k))




ggplot(indiv_indicators[indiv_indicators$user_id %in% unique(indiv_indicators$user_id)[1:10],], 
       aes(x = date, y = daily_fertility, col = user_id)) + 
  geom_line() + guides(col = FALSE)+
  facet_grid(user_id ~ . )

save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata"  ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata"  ), 
          to = paste0(IO$tmp_clue, "indiv_indicators_with_fertility.Rdata"  ))
## [1] TRUE
agg = ddply(indiv_indicators,
            .(cycle_id),
            .fun = summarize,
            n_eggwhite = sum(mucus == "egg_white", na.rm = TRUE),
            mucus_quality = max(mucus_num, na.rm = TRUE),
            n_bleeding_after_day_5 = sum((bleeding >=3) & (cycleday >5), na.rm = TRUE),
            n_heavy_bleeding = sum(bleeding ==4,na.rm = TRUE)
)

indiv_indicators = merge(indiv_indicators, agg, by = "cycle_id", all.x = TRUE)
indiv_indicators = indiv_indicators[order(indiv_indicators$o),]

# regularity
cycles = cycles[order(cycles$user_id, cycles$cycle_nb),]
cycles$cycle_id_next = paste0(cycles$user_id, "_", cycles$cycle_nb+1)
cycles$cycle_id_next2 = paste0(cycles$user_id, "_", cycles$cycle_nb+2)
cycles$cycle_length_next = cycles$cycle_length[match(cycles$cycle_id_next, cycles$cycle_id)]
cycles$cycle_length_next2 = cycles$cycle_length[match(cycles$cycle_id_next2, cycles$cycle_id)]

cycles$sd_cycle_length = apply(cycles[,grep("cycle_length",colnames(cycles))], 1, sd, na.rm = TRUE)
cycles$sd_cycle_length = replace_NAs_with_latest_value(cycles$sd_cycle_length)


indiv_indicators$sd_cycle_length = cycles$sd_cycle_length[match(indiv_indicators$cycle_id, cycles$cycle_id)]

save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata"  ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata"  ), 
          to = paste0(IO$tmp_clue, "indiv_indicators_with_cycle_fertility.Rdata"  ))
## [1] TRUE
indiv_indicators$cycle_fertility = 1/6*(
  sigmoid(indiv_indicators$mucus_quality,1.5,1) +
  1-sigmoid(abs(indiv_indicators$n_eggwhite - 3 ),3,1)+
  exp(-0.25*indiv_indicators$n_bleeding_after_day_5) +
  1 - sigmoid(indiv_indicators$n_heavy_bleeding,5,1.5)+
  1 - sigmoid(abs(28.5 - indiv_indicators$cycle_length),4,1)+
  1 - sigmoid(indiv_indicators$sd_cycle_length, 10,0.5)
)



indiv_indicators$overall_fertility = 1/2*(
  1 - sigmoid(abs(30 - indiv_indicators$age),15,0.3)+
  1 - 0.5*sigmoid(abs(25 - indiv_indicators$bmi),6,1)
)



indiv_indicators$fertility  = indiv_indicators$daily_fertility * 
  indiv_indicators$cycle_fertility * indiv_indicators$overall_fertility


save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata"  ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata"  ), 
          to = paste0(IO$tmp_clue, "indiv_indicators_with_overall_fertility.Rdata"  ))
## [1] FALSE
ggplot(indiv_indicators[indiv_indicators$user_id %in% unique(indiv_indicators$user_id)[1:10],], 
       aes(x = date, y = fertility, col = user_id)) + 
  geom_line() + guides(col = FALSE)+
  facet_grid(user_id ~ . )
## Warning: Removed 3830 rows containing missing values (geom_path).

Health

Population indicators

par$agg_col
## [1] "country" "age_cat" "bmi_cat"

We build the population indicators by aggregating by the following columns {r par$agg_col}

pop_indicators = ddply(indiv_indicators, .variable = c("date","time_num",par$agg_col), 
            .fun = summarize,
            n_users = sum(active_use)
)

pop_indicators = pop_indicators[order(pop_indicators$country,
                                      pop_indicators$age_cat,
                                      pop_indicators$bmi_cat,
                                      pop_indicators$time_num),]

pop_indicators$o = 1:nrow(pop_indicators)

save(pop_indicators, file = paste0(IO$output_clue,"pop_indicators.Rdata"))
file.copy(from = paste0(IO$output_clue,"pop_indicators.Rdata"), to = paste0(IO$tmp_clue,"pop_indicators_empty.Rdata"), overwrite = TRUE)
## [1] TRUE

Sex

agg = ddply(tracking, .variable = c("date",par$agg_col), 
                    .fun = summarize,
                    n_tot_sex = sum(type %in% c("withdrawal_sex","protected_sex","unprotected_sex")),
                    n_prot_sex = sum(type == "protected_sex"),
                    n_unprot_sex =  sum(type == "unprotected_sex"),
                    n_withdrawal =  sum(type == "withdrawal_sex")
  )

ggplot(agg, aes(x = date, y = n_tot_sex, col = age_cat, linetype = bmi_cat))  + geom_line() + facet_grid(country ~., scale = "free_y") + xlim(c(as.Date("2014-12-31"),as.Date("2018-01-01")))
## Warning: Removed 348 rows containing missing values (geom_path).

Needs to be divided by # of active user at each date

Fertility

agg = ddply(indiv_indicators, 
            .variable = c("date",par$agg_col), 
            .fun = summarize,
            daily_fertility_sum = sum(daily_fertility))

pop_indicators = merge(pop_indicators, agg, all.x = TRUE)
pop_indicators = pop_indicators[order(pop_indicators$o),]

pop_indicators$daily_fertility = pop_indicators$daily_fertility_sum / pop_indicators$n_users 

ggplot(pop_indicators, aes(x = date, y = daily_fertility, col = age_cat, linetype = bmi_cat))  + geom_line() + facet_grid(country ~., scale = "free_y")
## Warning: Removed 403 rows containing missing values (geom_path).

save(pop_indicators, file = paste0(IO$output_clue,"pop_indicators.Rdata"))
file.copy(from = paste0(IO$output_clue,"pop_indicators.Rdata"), to = paste0(IO$tmp_clue,"pop_indicators_with_fertility.Rdata"), overwrite = TRUE)
## [1] TRUE

Health